Data QC and preprocessing for microarray data
ed = read.xlsx("ExpDesign.xls", 1, stringsAsFactors=F)
ed$sample_ID=as.character(ed$Sample.Number)
ed$ShortID = ifelse(ed$DS_TumorOrganoids, paste(ed$Condition, ed$Site, ed$patient), paste(ed$Condition, ed$Site, ed$patient, ed$Medium))
rownames(ed) = gsub(".txt","",ed$filename)
setwd("../../Data/Raw/")
agilent.datacolumns=list(E='gMedianSignal',Eb = 'gBGMedianSignal',isNonUniform='gIsFeatNonUnifOL', isPopOutlier='gIsFeatPopnOL');
intensities =read.maimages(paste(ed$filename,sep=""), source="agilent.median", green.only=TRUE, columns=agilent.datacolumns)
## Read US22502595_251485083363_S01_GE1_1105_Oct12_1_1.txt
## Read US22502595_251485083363_S01_GE1_1105_Oct12_1_2.txt
## Read US22502595_251485083363_S01_GE1_1105_Oct12_1_3.txt
## Read US22502595_251485083363_S01_GE1_1105_Oct12_1_4.txt
## Read US22502595_251485083374_S01_GE1_1105_Oct12_1_1.txt
## Read US22502595_251485083374_S01_GE1_1105_Oct12_1_2.txt
## Read US22502595_251485083374_S01_GE1_1105_Oct12_1_3.txt
## Read US22502595_251485083374_S01_GE1_1105_Oct12_1_4.txt
## Read US22502595_251485083362_S01_GE1_1105_Oct12_1_1.txt
## Read US22502595_251485083362_S01_GE1_1105_Oct12_1_2.txt
## Read US22502595_251485083362_S01_GE1_1105_Oct12_1_3.txt
## Read US22502595_251485083362_S01_GE1_1105_Oct12_1_4.txt
## Read US22502595_251485083359_S01_GE1_1105_Oct12_1_1.txt
## Read US22502595_251485083359_S01_GE1_1105_Oct12_1_2.txt
## Read US22502595_251485083359_S01_GE1_1105_Oct12_1_3.txt
## Read US22502595_251485083359_S01_GE1_1105_Oct12_1_4.txt
## Read US22502595_251485083377_S01_GE1_1105_Oct12_1_1.txt
## Read US22502595_251485083377_S01_GE1_1105_Oct12_1_2.txt
## Read US22502595_251485083377_S01_GE1_1105_Oct12_1_3.txt
## Read US22502595_251485083377_S01_GE1_1105_Oct12_1_4.txt
## Read US22502595_251485083378_S01_GE1_1105_Oct12_1_1.txt
## Read US22502595_251485083378_S01_GE1_1105_Oct12_1_2.txt
## Read US22502595_251485083378_S01_GE1_1105_Oct12_1_3.txt
## Read US22502595_251485083378_S01_GE1_1105_Oct12_1_4.txt
## Read US22502595_251485072266_S02_GE1_1105_Oct12_1_1.txt
## Read US22502595_251485072266_S02_GE1_1105_Oct12_1_2.txt
## Read US22502595_251485072266_S02_GE1_1105_Oct12_1_3.txt
## Read US22502595_251485072266_S02_GE1_1105_Oct12_1_4.txt
setwd("../../Code/GeneExpressionMicroarray/")
# fix outdated chip annotations
new_anno_file = "../../Data/Raw/Agilent_14850_annotations_2017-02-08.Rdata"
load(new_anno_file)
old_anno = intensities$genes
take_over_cols = colnames(old_anno)[!colnames(old_anno) %in% c("GeneName","Description","SystematicName")]
tmp = old_anno[,take_over_cols]
tmp$index=1:nrow(tmp)
tmp = merge(tmp, anno_tab_14850, by.x="ProbeName", by.y="ProbeID", all.x=T, sort=F)
new_col_order = c(take_over_cols, colnames(tmp)[!colnames(tmp) %in% take_over_cols])
new_col_order = new_col_order[!new_col_order %in% c("GO_BP","GO_CC","GO_MF")]
new_anno = tmp[order(tmp$index),new_col_order]
intensities$genes = new_anno
intensities$probe_exclude=(intensities$isNonUniform>0)|(intensities$isPopOutlier>0)
intensities$E[intensities$probe_exclude] <- NA
Data overview
This document describes the preprocessing and QC of microarray data from project MB189 tumor/normal tissues and organoids (Mirjana Kessler, Karen Hoffmann) using a single-color hybridization. Micro arrays used had design Agilent 014850 (4x44k Whole Human Genome v1).
Samples
cols_to_exclude = c("filename", "Sample.Number")
ed_filtered = ed[,!colnames(ed) %in% cols_to_exclude]
rownames(ed_filtered) <- NULL
#print.xtable(xtable(ed_filtered,display=rep("s",ncol(ed)-length(cols_to_exclude)+1), align=paste(c(rep("|l",ncol(ed)-length(cols_to_exclude)+1),"|"), collapse="")), type="html", file="" , include.rownames=F)
kable(ed_filtered)
| 251485083363_1_1 |
7 |
Tissue |
Tumor |
none |
OC7-T |
1 |
Tissue Tumor OC7 |
1 |
Tissue Tumor 7 |
| 251485083363_1_2 |
7 |
Organoid |
Tumor |
M1 |
OC7-O |
1 |
Organoid Tumor OC7 |
2 |
Organoid Tumor 7 |
| 251485083363_1_3 |
9 |
Tissue |
Tumor |
none |
OC9-T |
1 |
Tissue Tumor OC9 |
3 |
Tissue Tumor 9 |
| 251485083363_1_4 |
9 |
Organoid |
Tumor |
M1 |
OC9-O |
1 |
Organoid Tumor OC9 |
4 |
Organoid Tumor 9 |
| 251485083374_1_1 |
8 |
Tissue |
Tumor |
none |
OC8-T |
1 |
Tissue Tumor OC8 |
5 |
Tissue Tumor 8 |
| 251485083374_1_2 |
8 |
Organoid |
Tumor |
M1 |
OC8-O |
1 |
Organoid Tumor OC8 |
6 |
Organoid Tumor 8 |
| 251485083374_1_3 |
5 |
Organoid |
Tumor |
M1 |
OC5-O |
1 |
Organoid Tumor OC5 |
7 |
Organoid Tumor 5 |
| 251485083374_1_4 |
5 |
Tissue |
Tumor |
none |
OC5-T |
1 |
Tissue Tumor OC5 |
8 |
Tissue Tumor 5 |
| 251485083362_1_1 |
4 |
Tissue |
Tumor |
none |
OC4-T |
1 |
Tissue Tumor OC4 |
9 |
Tissue Tumor 4 |
| 251485083362_1_2 |
4 |
Organoid |
Tumor |
M1 |
OC4-O |
1 |
Organoid Tumor OC4 |
10 |
Organoid Tumor 4 |
| 251485083362_1_3 |
6 |
Tissue |
Tumor |
none |
OC6-T |
1 |
Tissue Tumor OC6 |
11 |
Tissue Tumor 6 |
| 251485083362_1_4 |
6 |
Organoid |
Tumor |
M1 |
OC6-O |
1 |
Organoid Tumor OC6 |
12 |
Organoid Tumor 6 |
| 251485083359_1_1 |
1 |
Organoid |
Normal |
StandardFT |
FT OC1-O |
1 |
Organoid Normal OC1 |
13 |
Organoid Normal 1 |
| 251485083359_1_2 |
2 |
Organoid |
Normal |
StandardFT |
FT OC2-O |
1 |
Organoid Normal OC2 |
14 |
Organoid Normal 2 |
| 251485083359_1_3 |
2 |
Organoid |
Tumor |
M1 |
OC2-O |
1 |
Organoid Tumor OC2 |
15 |
Organoid Tumor 2 |
| 251485083359_1_4 |
1 |
Organoid |
Tumor |
M1 |
OC1-O |
1 |
Organoid Tumor OC1 |
16 |
Organoid Tumor 1 |
| 251485083377_1_1 |
217 |
Tissue |
Normal |
none |
FT217-T |
1 |
Tissue Normal FT217 |
17 |
Tissue Normal 217 |
| 251485083377_1_2 |
217 |
Organoid |
Normal |
StandardFT |
FT217-O |
1 |
Organoid Normal FT217 |
18 |
Organoid Normal 217 |
| 251485083377_1_3 |
223 |
Tissue |
Normal |
none |
FT223-T |
1 |
Tissue Normal FT223 |
19 |
Tissue Normal 223 |
| 251485083377_1_4 |
223 |
Organoid |
Normal |
StandardFT |
FT223-O |
1 |
Organoid Normal FT223 |
20 |
Organoid Normal 223 |
| 251485083378_1_1 |
231 |
Tissue |
Normal |
none |
FT231-T |
1 |
Tissue Normal FT231 |
21 |
Tissue Normal 231 |
| 251485083378_1_2 |
231 |
Organoid |
Normal |
StandardFT |
FT231-O |
1 |
Organoid Normal FT231 |
22 |
Organoid Normal 231 |
| 251485083378_1_3 |
1 |
Tissue |
Tumor |
none |
OC1-T |
1 |
Tissue Tumor OC1 |
23 |
Tissue Tumor 1 |
| 251485083378_1_4 |
2 |
Tissue |
Tumor |
none |
OC2-T |
1 |
Tissue Tumor OC2 |
24 |
Tissue Tumor 2 |
| 251485072266_1_1 |
7 |
Organoid |
Tumor |
M1 |
OC7-O |
0 |
Organoid Tumor OC7-M1 |
25 |
Organoid Tumor 7 M1 |
| 251485072266_1_2 |
7 |
Organoid |
Tumor |
M1_Wnt |
OC7-O |
0 |
Organoid Tumor OC7-M1_Wnt |
26 |
Organoid Tumor 7 M1_Wnt |
| 251485072266_1_3 |
4 |
Organoid |
Tumor |
M1 |
OC4-O |
0 |
Organoid Tumor OC4-M1 |
27 |
Organoid Tumor 4 M1 |
| 251485072266_1_4 |
4 |
Organoid |
Tumor |
M1_Wnt |
OC4-O |
0 |
Organoid Tumor OC4-M1_Wnt |
28 |
Organoid Tumor 4 M1_Wnt |
Raw intensity data
Intensity distribution across samples
par(mfrow=c(2,1), mar=c(11,4,4,1))
ii = log2(intensities$E)
colnames(ii) = ed[colnames(ii),]$ShortID
boxplot(ii, las=2, main = "Raw FG intensities", ylim=c(0,20))
ii = log2(intensities$Eb)
colnames(ii) = ed[colnames(ii),]$ShortID
boxplot(ii, las=2, main = "Raw BG intensities", ylim=c(0,20))
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 25 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 26 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 27 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 28 is not drawn

par(mfrow=c(1,1))
plotDensities(intensities, group = ed[colnames(intensities$E),"ShortID"], legend="topright", main="Intensity distribution by experiment run ID")
## Warning in plotDensities.EListRaw(intensities, group =
## ed[colnames(intensities$E), : NaNs produced

There are around 1500-2500 probes with background equal to zero in the last 4 arrays (M1 and M1_Wnt comparison) - not clear why this happened. Since we anyway will estimate the background from all probes using a statistical model below we will ignore this issue.
Correlation of raw intensities across samples
raw_cor = cor(log2(intensities$E),method="spearman", use="pairwise")
colnames(raw_cor) = ed[colnames(raw_cor),]$ShortID
#rownames(raw_cor) = ed[rownames(raw_cor),]$sample_ID
pheatmap(raw_cor, cluster_rows = F, cluster_cols=F)

Preprocessing
Data will be preprocessed as follows:
- Background for each array will be corrected using the “normexp” method (Ritchie et al 2007, Bioinformatics, p. 2700-07) and an offset of 15.
- Between-array normalization will be perfomed using the quantile method (Bolstad et al 2003, Bioinformatics, p. 185).
Background correction
bg_corrected = backgroundCorrect(intensities,method="normexp",offset=15)
Intensity distribution across samples after background correction
par(mar=c(12,4,4,1))
boxplot(log2(bg_corrected$E), las=2, names = ed[colnames(bg_corrected$E), "ShortID"], main = "BG-corrected FG intensities", ylim=c(-2,20))

Normalization
normalized = normalizeBetweenArrays(bg_corrected,method="quantile")
Intensity distribution across samples after BG correction and normalization
boxplot(log2(normalized$E), las=2, names = ed[colnames(normalized$E), "ShortID"],main = "Normalized FG intensities", ylim=c(-2,20))

Correlation of normalized intensities across samples
norm_cor = cor(log2(normalized$E),method="spearman", use="pairwise")
colnames(norm_cor) = ed[colnames(norm_cor),]$sample_ID
rownames(norm_cor) = ed[rownames(norm_cor),]$ShortID
pheatmap(norm_cor)

Primary Component Analysis
norm_exp = normalized$E
NA_rows = apply(norm_exp,1,function(x) sum(is.na(x)))
pca = prcomp(t(norm_exp[NA_rows==0,]))
imp = summary(pca)$importance
cp = palette(rainbow(8))
plot(pca$x[,1], pca$x[,2], type="p", xlab=paste("1st PC (",round(imp[2,1],2)*100 ,"% var explained)",sep="" ), ylab=paste("2nd PC (",round(imp[2,2],2)*100 ,"% var explained)",sep="" ), main="PCA on normalized expression data", xlim=c(min(pca$x[,1])*1.5,max(pca$x[,1])*4))
text(pca$x[,1],pca$x[,2],labels=ed[colnames(normalized$E),]$ShortID, col=cp[as.numeric(as.factor(ed[colnames(normalized$E),]$Condition))], adj=-0.05)
abline(h=0)
abline(v=0)

Control probes
The following control probes exist on the arrays used in this experiment:
- Corner associated (used for orientation purposes during scanning)
- Bright corner
- Dark corner
- Negative controls
- 3xSLv1 (hairpin probe that does not hybridize well with any possible RNA)
- Positive controls
- Human GAPDH and PGK1 probes
- Deletion stringency probes (DCP, probe with varying number of insertions/changes with respect to reference; the number after the “_" denotes the number of differences to the reference which should correlate with lower expression)
- E1A_r60: spike-in probes with concentrations that should cover the whole dynamic range of the array
There are a few other expression probes that are used by Agilent’s feature extraction/QC pipeline.
control_probes = which(intensities$genes$ControlType!=0)
cp_data = intensities$E[control_probes,]
cp_names = intensities$genes[control_probes,]
selected_controls = ifelse(substr(cp_names$ProbeName,1,4)=="ERCC",F,T)
# control probes
for (i in 1:ncol(cp_data)) {
boxplot(log2(cp_data[selected_controls,i]) ~ factor(cp_names$ProbeName[selected_controls]),las=2, main=paste("Sample",i), outline=F)
}




























Number of detected probes
Signal for probes will be considered detected if the expression of a probe is above the 95% quantile of expression in negative control probes. In a typical experiment one would expect about 50% of all genes to be detectable.
neg95 <- apply(normalized$E[normalized$genes$ControlType==-1,],2,function(x) quantile(x,p=0.95, na.rm=T))
cutoff <- matrix(1.1*neg95,nrow(normalized),ncol(normalized),byrow=TRUE)
isexpr <- rowSums(normalized$E > cutoff, na.rm=T) >= 2
# about 50% of genes should be detected
#table(isexpr)
isexpr_sample <- colSums(normalized$E > cutoff, na.rm=T)
sample_order = order(ed[names(isexpr_sample),]$ShortID)
isexpr_sample_ordered = isexpr_sample[sample_order]
nn = ed[names(isexpr_sample),]$ShortID[sample_order]
par(mar=c(15,4,4,2))
barplot(isexpr_sample_ordered, main="number of detected probes per sample", las=2, names.arg=nn)

par(mar=c(8,4,4,2))
save(ed, intensities, normalized, file="../../Data/Processed/Tumor_organoids_micro_array_preprocessed_data.Rdata")